This notebook will hold all analytics related code on this dataset. There are a list of questions I would like to answer and provide meaningfull visualization of the answers. Some questions are:
In [209]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from unidecode import unidecode
import time
from bokeh.charts import Bar, output_file, show
from bokeh.sampledata.autompg import autompg as df
from bokeh.io import output_notebook, show
output_notebook()
jobs = pd.read_csv('../data/stackoverflow_jobs_enhanced.csv', thousands=',')
technologies = pd.read_csv('../data/technologies.csv')
# this is needed for excel export
jobs.country = jobs.country.astype(str)
jobs.city = jobs.city.astype(str)
jobs['city']=jobs['city'].apply( lambda x: unidecode(unicode(x, encoding = "utf-8")))
jobs['country']=jobs['country'].apply( lambda x: unidecode(unicode(x, encoding = "utf-8")))
In [210]:
top_cities = jobs.groupby(['city'])['jobid'].count().sort_values(ascending=False)
top_cities = top_cities.nlargest(15)
# create a data frame out of the series.
# found this trick at stackoverflow:
# http://stackoverflow.com/questions/10373660/converting-a-pandas-groupby-object-to-dataframe
df_cities = pd.DataFrame({'count' : top_cities}).reset_index()
In [211]:
p = Bar(df_cities, values='count', title='Jobs posted by citites', legend='top_right', label=CatAttr(columns=['city'], sort=False),)
# Need to add different color for different cities
show(p)
In [212]:
top_countries = jobs.groupby(['country'])['jobid'].count().sort_values(ascending=False)
top_countries = top_countries.nlargest(15)
df_countries = top_countries.to_frame('count').reset_index()
top_countries
Out[212]:
In [213]:
bar_countries = Bar(df_countries, values='count', title='Jobs posted by countries', legend='top_right', label=CatAttr(columns=['country'], sort=False),)
# Need to add different color for different cities
show(bar_countries)
In [214]:
# London top technologies
london_tech = technologies[technologies.city == 'London'].groupby(['city', 'tech'])['jobid'].count().sort_values(ascending=False)
london_tech.nlargest(10)
Out[214]:
In [205]:
# Amsterdam top technologies
amsterdam_tech = technologies[technologies.city == 'Amsterdam'].groupby(['city', 'tech'])['jobid'].count().sort_values(ascending=False)
amsterdam_tech.nlargest(10)
Out[205]:
In [206]:
# Berlin
berlin_tech = technologies[technologies.city == 'Berlin'].groupby(['city', 'tech'])['jobid'].count().sort_values(ascending=False)
berlin_tech.nlargest(10)
Out[206]:
In [207]:
# Silicon Valley
cal_tech = technologies[technologies.state == 'CA'].groupby('tech')['jobid'].count().sort_values(ascending=False)
cal_tech.nlargest(10)
Out[207]:
In [166]:
# Ordered view of what technologies are the most sought after in wich city
all_tech = technologies.groupby(['city', 'tech'])['jobid'].count().sort_values(ascending=False)
amsterdam_tech.nlargest(100)
Out[166]:
In [26]:
cities.to_frame('city').to_csv('../data/cities.csv', encoding = 'utf-8')
countries.to_frame('countries').to_csv('../data/countries.csv', encoding = 'utf-8')
In [168]:
ew = pd.ExcelWriter('../data/stackjobs.xlsx',options={'encoding':'utf-8'})
df_cities.to_excel(ew, 'City')
df_countries.to_excel(ew, 'Country')
ew.save()
In [ ]: